import requests
import folium
import pickle
import re
import string
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
from folium import plugins
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the pre-cleaned listings table from disk.
# NOTE(review): pickle can execute arbitrary code while loading, so this file
# must only ever come from a trusted source (e.g. this project's own cleaning
# step) -- confirm where "cleandata.ft" is produced.
with open("datafiles/cleandata.ft", "rb") as data_file:
    # The context manager closes the handle even if unpickling fails.
    df = pickle.load(data_file)
df.head()
| totPurchaseAmt | latitude | longitude | bathrooms | photoCount | bedrooms | livingArea | yearBuilt | averageSchoolRating | zip | description | zestimate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 198000.0 | 45.237189 | -93.409535 | 2.0 | 0 | 4.0 | 1716.0 | 1996 | 5.333333 | 55303 | 14649 Iodine Ct NW, Ramsey, MN is a single fam... | 285985.0 |
| 3 | 415000.0 | 45.278217 | -93.407533 | 2.0 | 30 | 3.0 | 3108.0 | 1985 | 6.333333 | 55303 | Stunning walk to Rum River and Creek running t... | 431995.0 |
| 4 | 329900.0 | 45.143781 | -93.021604 | 3.0 | 0 | 4.0 | 1814.0 | 2001 | 6.500000 | 55038 | 2372 Tart Lake Rd, Lino Lakes, MN is a single ... | 318162.0 |
| 5 | 262000.0 | 45.164166 | -93.297836 | 2.0 | 36 | 3.0 | 2158.0 | 1985 | 4.666667 | 55043 | This large rambler is located on a beautiful, ... | 272169.0 |
| 6 | 280000.0 | 45.283700 | -93.332023 | 2.0 | 22 | 3.0 | 1993.0 | 1976 | 7.333333 | 55304 | You won't want to miss out on this beautiful h... | 290365.0 |
# Number of rows in the loaded table
df.shape[0]
28078
# Patterns are compiled once at import time: remove_mypunct is called once per
# listing description, so rebuilding them on every call is wasted work.
_TAG_MENTION_URL_RE = re.compile(r"(#|@|http)\S+")
_CONTROL_AND_QUOTES_RE = re.compile(r"(\t|\n|\v|…|“|”)")
_DIGIT_TOKEN_RE = re.compile(r"\w*\d\w*")
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RUN_RE = re.compile(r"[\W_]+")


def _clean_text(text):
    """Normalise one string; see remove_mypunct for the individual steps."""
    # Drop tokens starting with '#', '@' or 'http' (hashtags, mentions, URLs).
    text = _TAG_MENTION_URL_RE.sub("", text)
    # Delete tabs, newlines, vertical tabs, ellipsis and curly double quotes.
    # They are removed outright (not replaced by a space), so the words on
    # either side are joined.
    text = _CONTROL_AND_QUOTES_RE.sub("", text)
    # Lower-case, then blank out every word-token that contains a digit.
    text = _DIGIT_TOKEN_RE.sub(' ', text.lower())
    # Replace ASCII punctuation with spaces.
    text = _PUNCTUATION_RE.sub(' ', text)
    # Collapse each run of non-word characters/underscores (whitespace,
    # symbols, emojis, ...) into a single space.
    return _NON_WORD_RUN_RE.sub(" ", text)


def remove_mypunct(corpus):
    """Clean an iterable of strings for sentiment analysis.

    Each string has hashtags/mentions/URLs removed, selected control and
    typographic characters deleted, is lower-cased, has digit-bearing tokens
    and punctuation replaced by spaces, and finally has every run of non-word
    characters collapsed to one space. Leading/trailing spaces are not
    stripped.

    Args:
        corpus: iterable of ``str``.

    Returns:
        A list with one cleaned string per input string, in input order.
    """
    return [_clean_text(text) for text in corpus]
# Score every listing description with VADER and keep only the compound score.
sid = SentimentIntensityAnalyzer()


def _compound_sentiment(description):
    """Return the VADER 'compound' score of one description after cleaning."""
    cleaned = remove_mypunct([description])[0]
    return float(sid.polarity_scores(cleaned)['compound'])


# Series.map fills the whole column in one pass instead of writing cell by
# cell with df.at; the unused score_tuples/score accumulators were dropped.
df['sentiment'] = df['description'].map(_compound_sentiment)

# The raw text is no longer needed once it has been reduced to a score.
df = df.drop("description", axis=1)
# Summary statistics of the per-listing sentiment column
sentiment_scores = df["sentiment"]
print("Maximum Sentiment Score:", sentiment_scores.max())
print("Minimum Sentiment Score:", sentiment_scores.min())
print("Average Sentiment of Description:", sentiment_scores.mean())
Maximum Sentiment Score: 0.9992 Minimum Sentiment Score: -0.8934 Average Sentiment of Description: 0.5923133342830685
# Pairwise correlations between the remaining columns; zip, latitude and
# longitude are left out of the matrix.
numericalData = df.drop(columns=["zip", "latitude", "longitude"])
corr = numericalData.corr()

# Plot the correlation matrix as an annotated heatmap on a 10x10 figure.
plt.subplots(figsize=(10,10))
feature_names = corr.columns
palette = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(corr, xticklabels=feature_names, yticklabels=feature_names,
            annot=True, cmap=palette)
<AxesSubplot:>
We see that totPurchaseAmt and zestimate have a strong positive correlation. This is expected, since the Zillow price prediction model is built on the totPurchaseAmt data.
We see that the price of a house is positively correlated with its living area and its number of bathrooms.
As expected, houses with more bathrooms tend to have a higher totPurchaseAmt.
# Zestimate against actual purchase amount
df.plot.scatter(x='zestimate', y='totPurchaseAmt')
<AxesSubplot:xlabel='zestimate', ylabel='totPurchaseAmt'>
# Living area against purchase amount
sns.scatterplot(data=df, x='livingArea', y='totPurchaseAmt')
<AxesSubplot:xlabel='livingArea', ylabel='totPurchaseAmt'>
# Number of bathrooms against purchase amount
sns.scatterplot(data=df, x='bathrooms', y='totPurchaseAmt')
<AxesSubplot:xlabel='bathrooms', ylabel='totPurchaseAmt'>
# Living area against number of bathrooms
sns.scatterplot(data=df, x='livingArea', y='bathrooms')
<AxesSubplot:xlabel='livingArea', ylabel='bathrooms'>
# Per-zip counts of non-null values in every column; the totPurchaseAmt
# count is used as the number of listings for that zip.
numListings = df.groupby('zip').count()

# Two-column table for the choropleth: zip code as a string plus its count.
numListingsByZip = pd.DataFrame({
    "zip": [str(zip_code) for zip_code in numListings.index],
    "numListings": numListings["totPurchaseAmt"].values,
})
# Download the Minnesota zip-code GeoJSON. The timeout stops the notebook
# from hanging on a stalled connection, and raise_for_status() surfaces an
# HTTP error here instead of as a confusing JSON decode failure below.
jsonData = requests.get(
    "https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/mn_minnesota_zip_codes_geo.min.json",
    timeout=30,
)
jsonData.raise_for_status()
mnArea = jsonData.json()

# NOTE(review): newer folium releases may no longer bundle the "Stamen Toner"
# tiles and may require an explicit tile URL plus `attr` -- confirm against
# the installed folium version.
mnMap = folium.Map(location=[44.9778, -93.2650], tiles="Stamen Toner", zoom_start=8)

# Shade each zip-code polygon by its listing count. folium.Choropleth is the
# class-based replacement for the deprecated Map.choropleth() method; rows
# are matched to polygons through the ZCTA5CE10 property.
folium.Choropleth(
    geo_data=mnArea,
    data=numListingsByZip,
    columns=["zip", "numListings"],
    key_on='feature.properties.ZCTA5CE10',
    fill_color='YlGn',
    fill_opacity=1,
).add_to(mnMap)
mnMap